# Resources: RStudio Cheat Sheets | R Markdown Introduction | R Markdown Gallery
# This is to be run every time you use this R Markdown document
library(dplyr) # Data Manipulation
library(knitr) # R Markdown files
library(readr) # Reading data into R
library(tidyr) # Data Transformation
# Print the current working directory, i.e. the folder R reads from and writes to by default
getwd()
## [1] "C:/Users/alwalker/Desktop/NEAIR Presentation"
# How to find a folder's path on Windows:
# 1 Open Windows Explorer
# 2 Navigate to the C drive
# 3 Select "Users"
# 4 Select Desktop
# 5 Copy the address **BE SURE TO REPLACE EVERY "\" WITH "/"**
# Make that folder the working directory so files in it can be referred to by name alone.
# NOTE(review): this absolute path is specific to the author's machine; replace it with your own folder before running.
setwd("c:/Users/alwalker/Desktop/NEAIR Presentation")
# Format of an R object: object_name <- value
title <- "You_R_What_You_Code"
year <- 2020
# Reading data in (readr package)
# Because a working directory was set above, the file name alone is enough to read the file.
# Without a working directory, you would pass the full path instead (run without the #):
# degrees <- read_csv("c:/Users/alwalker/Desktop/NEAIR Presentation/NEAIR_IPEDS_Sample_Data_Set_Degrees.csv")
# Only one of the two calls is needed; both load the same file.
degrees <- read_csv(file = "NEAIR_IPEDS_Sample_Data_Set_Degrees.csv")
# The two example objects are no longer needed, so remove them from the environment
rm(list = c("title", "year"))
#### Method 2--Through R Studio's Import Data function (refer to NEAIR's video to view)
#### Method 3--Reading the data straight from the Internet
degrees <- read.csv(
  file = url("https://raw.githubusercontent.com/annlaurawalker/NEAIR_R-R_Session/master/NEAIR_IPEDS_Sample_Data_Set_Degrees.csv")
)
# List the column names exactly as they were imported
colnames(x = degrees)
## [1] "unitid"
## [2] "institution.name"
## [3] "year"
## [4] "HD2019.State.abbreviation"
## [5] "HD2019.Sector.of.institution"
## [6] "HD2019.Historically.Black.College.or.University"
## [7] "DRVC2019.Associate.s.degree"
## [8] "DRVC2019.Bachelor.s.degree"
## [9] "DRVC2019.Master.s.degree"
## [10] "DRVC2019.Doctor.s.degree...research.scholarship"
## [11] "DRVC2019.Doctor.s.degree...professional.practice"
## [12] "DRVC2019.Doctor.s.degree...other"
## [13] "DRVC2019.Certificates.of.less.than.1.year"
## [14] "DRVC2019.Certificates.of.1.but.less.than.2.years"
## [15] "DRVC2019.Certificates.of.2.but.less.than.4.years"
## [16] "DRVC2019.Postbaccalaureate.certificates"
## [17] "DRVC2019.Post.master.s.certificates"
## [18] "DRVC2019.Number.of.students.receiving.an.Associate.s.degree"
## [19] "DRVC2019.Number.of.students.receiving.a.Bachelor.s.degree"
## [20] "DRVC2019.Number.of.students.receiving.a.Master.s.degree"
## [21] "DRVC2019.Number.of.students.receiving.a.Doctor.s.degree"
## [22] "DRVC2019.Number.of.students.receiving.a.certificate.of.less.than.1.year"
## [23] "DRVC2019.Number.of.students.receiving.a.certificate.of.1.but.less.than.4.years"
## [24] "DRVC2019.Number.of.students.receiving.a.Postbaccalaureate.or.Post.master.s.certificate"
# The imported column names are long and full of punctuation.
# Changing column names: the replacements are assigned by position, so they must
# stay in the same order as the 24 original columns.
colnames(degrees) <- c(
  # identifiers and institution attributes
  "unitid", "inst_name", "year", "state", "inst_sector", "hbcu",
  # counts of degrees awarded
  "count_assoc_degrees", "count_bach_degrees", "count_mast_degrees",
  "count_phd_rs_degrees", "count_phd_pp_degrees", "count_phd_other_degrees",
  # counts of certificates awarded
  "count_certs_less_yr", "count_certs_1_2_yr", "count_certs_2_4_yr",
  "count_certs_post_bach", "count_certs_post_mast",
  # counts of students receiving each award
  "count_assoc_students", "count_bach_students", "count_mast_students",
  "count_phd_students", "count_certs_less_yr_students",
  "count_certs_1_4_yr_students", "count_certs_post_bach_mast_students"
)
# Confirm the new names
colnames(degrees)
## [1] "unitid" "inst_name"
## [3] "year" "state"
## [5] "inst_sector" "hbcu"
## [7] "count_assoc_degrees" "count_bach_degrees"
## [9] "count_mast_degrees" "count_phd_rs_degrees"
## [11] "count_phd_pp_degrees" "count_phd_other_degrees"
## [13] "count_certs_less_yr" "count_certs_1_2_yr"
## [15] "count_certs_2_4_yr" "count_certs_post_bach"
## [17] "count_certs_post_mast" "count_assoc_students"
## [19] "count_bach_students" "count_mast_students"
## [21] "count_phd_students" "count_certs_less_yr_students"
## [23] "count_certs_1_4_yr_students" "count_certs_post_bach_mast_students"
# The code below uses the dplyr package
# "%>%" is called a pipe. Read it as "and then": it hands the result on its left
# to the function on its right as that function's first argument.
# In prose, the code below means
# Create a new data frame named "degrees_arranged" by
# taking the data frame "degrees" AND THEN
# selecting these fields in this order
degrees_arranged <- degrees %>%
  select(
    unitid, inst_name, year,
    count_assoc_degrees, count_assoc_students,
    count_bach_degrees, count_bach_students,
    count_mast_degrees, count_mast_students,
    count_phd_rs_degrees, count_phd_pp_degrees, count_phd_other_degrees,
    count_phd_students,
    count_certs_less_yr, count_certs_less_yr_students,
    count_certs_1_2_yr, count_certs_2_4_yr, count_certs_1_4_yr_students,
    count_certs_post_bach, count_certs_post_mast,
    count_certs_post_bach_mast_students
  )
# Note both the new data frame in the environment AND its number of variables!
# select() keeps only what is listed, so inst_sector, state and hbcu were dropped.
# Sometimes you will want to get rid of variables, but other times you'll want to keep them all.
# Here every column is listed, so this call only reorders "degrees":
degrees <- degrees %>%
  select(
    year, state, unitid, inst_name, hbcu, inst_sector,
    count_assoc_degrees, count_assoc_students,
    count_bach_degrees, count_bach_students,
    count_mast_degrees, count_mast_students,
    count_phd_rs_degrees, count_phd_pp_degrees, count_phd_other_degrees,
    count_phd_students,
    count_certs_less_yr, count_certs_less_yr_students,
    count_certs_1_2_yr, count_certs_2_4_yr, count_certs_1_4_yr_students,
    count_certs_post_bach, count_certs_post_mast,
    count_certs_post_bach_mast_students
  )
# rm() removes the listed items from the environment
rm(degrees_arranged)
# Show the first six rows
head(degrees, n = 6)
## year state unitid inst_name hbcu
## 1 2019 Maryland 491631 Women's Institute of Torah Seminary No
## 2 2019 Virginia 234155 Virginia State University Yes
## 3 2019 Virginia 231624 William & Mary No
## 4 2019 Maryland 163912 St. Mary's College of Maryland No
## 5 2019 Ohio 200590 ETI Technical College No
## 6 2019 Rhode Island 479062 MotoRing Technical Training Institute No
## inst_sector count_assoc_degrees
## 1 Private not-for-profit, 4-year or above 0
## 2 Public, 4-year or above 0
## 3 Public, 4-year or above 0
## 4 Public, 4-year or above 0
## 5 Private for-profit, 2-year 11
## 6 Private for-profit, less-than 2-year 0
## count_assoc_students count_bach_degrees count_bach_students
## 1 0 45 45
## 2 0 672 672
## 3 0 1653 1653
## 4 0 392 392
## 5 11 NA NA
## 6 0 NA NA
## count_mast_degrees count_mast_students count_phd_rs_degrees
## 1 0 0 0
## 2 113 113 19
## 3 737 734 85
## 4 29 29 0
## 5 NA NA NA
## 6 NA NA NA
## count_phd_pp_degrees count_phd_other_degrees count_phd_students
## 1 0 0 0
## 2 0 0 19
## 3 230 0 315
## 4 0 0 0
## 5 NA NA NA
## 6 NA NA NA
## count_certs_less_yr count_certs_less_yr_students count_certs_1_2_yr
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 91
## 6 0 0 43
## count_certs_2_4_yr count_certs_1_4_yr_students count_certs_post_bach
## 1 0 0 0
## 2 0 0 20
## 3 0 0 9
## 4 0 0 0
## 5 0 91 NA
## 6 0 43 NA
## count_certs_post_mast count_certs_post_bach_mast_students
## 1 0 0
## 2 0 20
## 3 7 16
## 4 0 0
## 5 NA NA
## 6 NA NA
# Example: how to see the 10th row of data.
# Square brackets index as [row, column]; leaving the column blank returns every column.
degrees[10, ]
## year state unitid inst_name hbcu inst_sector
## 10 2019 Ohio 457891 Elevate Salon Institute No Private for-profit, 2-year
## count_assoc_degrees count_assoc_students count_bach_degrees
## 10 NA NA NA
## count_bach_students count_mast_degrees count_mast_students
## 10 NA NA NA
## count_phd_rs_degrees count_phd_pp_degrees count_phd_other_degrees
## 10 NA NA NA
## count_phd_students count_certs_less_yr count_certs_less_yr_students
## 10 NA NA NA
## count_certs_1_2_yr count_certs_2_4_yr count_certs_1_4_yr_students
## 10 NA NA NA
## count_certs_post_bach count_certs_post_mast
## 10 NA NA
## count_certs_post_bach_mast_students
## 10 NA
# Show the first 25 rows
head(x = degrees, n = 25)
## year state unitid inst_name hbcu
## 1 2019 Maryland 491631 Women's Institute of Torah Seminary No
## 2 2019 Virginia 234155 Virginia State University Yes
## 3 2019 Virginia 231624 William & Mary No
## 4 2019 Maryland 163912 St. Mary's College of Maryland No
## 5 2019 Ohio 200590 ETI Technical College No
## 6 2019 Rhode Island 479062 MotoRing Technical Training Institute No
## 7 2019 Maryland 444291 Empire Beauty School-Owings Mills No
## 8 2019 Delaware 450298 Strayer University-Delaware No
## 9 2019 Virginia 232919 Tidewater Tech-Trades No
## 10 2019 Ohio 457891 Elevate Salon Institute No
## 11 2019 Maryland 162609 Garrett College No
## 12 2019 Ohio 203535 Kenyon College No
## 13 2019 Massachusetts 166124 College of the Holy Cross No
## 14 2019 Maine 160940 Purdue University Global-Lewiston No
## 15 2019 Maryland 162283 Coppin State University Yes
## 16 2019 Maryland 162168 Chesapeake College No
## 17 2019 Connecticut 455798 Oxford Academy of Hair Design Inc No
## 18 2019 Ohio 202453 Dayton Barber College No
## 19 2019 Connecticut 129923 Lincoln Technical Institute-New Britain No
## 20 2019 Ohio 201867 Cincinnati College of Mortuary Science No
## 21 2019 Massachusetts 164474 Andover Newton Theological School No
## 22 2019 Ohio 493521 Global Tech College No
## 23 2019 Virginia 232265 Hampton University Yes
## 24 2019 Maine 161518 Saint Joseph's College of Maine No
## 25 2019 West Virginia 377652 Valley College-Beckley No
## inst_sector count_assoc_degrees
## 1 Private not-for-profit, 4-year or above 0
## 2 Public, 4-year or above 0
## 3 Public, 4-year or above 0
## 4 Public, 4-year or above 0
## 5 Private for-profit, 2-year 11
## 6 Private for-profit, less-than 2-year 0
## 7 Private for-profit, less-than 2-year 0
## 8 Private for-profit, 4-year or above 13
## 9 Private for-profit, less-than 2-year 0
## 10 Private for-profit, 2-year NA
## 11 Public, 2-year 102
## 12 Private not-for-profit, 4-year or above 0
## 13 Private not-for-profit, 4-year or above 0
## 14 Public, 4-year or above NA
## 15 Public, 4-year or above 0
## 16 Public, 2-year 243
## 17 Private for-profit, less-than 2-year 0
## 18 Private for-profit, 2-year 0
## 19 Private for-profit, less-than 2-year 0
## 20 Private not-for-profit, 4-year or above 66
## 21 Private not-for-profit, 4-year or above NA
## 22 Private for-profit, 2-year 0
## 23 Private not-for-profit, 4-year or above 0
## 24 Private not-for-profit, 4-year or above 4
## 25 Private for-profit, less-than 2-year 0
## count_assoc_students count_bach_degrees count_bach_students
## 1 0 45 45
## 2 0 672 672
## 3 0 1653 1653
## 4 0 392 392
## 5 11 NA NA
## 6 0 NA NA
## 7 0 NA NA
## 8 13 31 31
## 9 0 NA NA
## 10 NA NA NA
## 11 102 NA NA
## 12 0 434 434
## 13 0 698 698
## 14 NA NA NA
## 15 0 378 378
## 16 243 NA NA
## 17 0 NA NA
## 18 0 NA NA
## 19 0 NA NA
## 20 66 48 48
## 21 NA NA NA
## 22 0 NA NA
## 23 0 640 640
## 24 4 311 311
## 25 0 NA NA
## count_mast_degrees count_mast_students count_phd_rs_degrees
## 1 0 0 0
## 2 113 113 19
## 3 737 734 85
## 4 29 29 0
## 5 NA NA NA
## 6 NA NA NA
## 7 NA NA NA
## 8 12 12 0
## 9 NA NA NA
## 10 NA NA NA
## 11 NA NA NA
## 12 0 0 0
## 13 0 0 0
## 14 NA NA NA
## 15 66 66 0
## 16 NA NA NA
## 17 NA NA NA
## 18 NA NA NA
## 19 NA NA NA
## 20 0 0 0
## 21 NA NA NA
## 22 NA NA NA
## 23 141 141 14
## 24 196 196 0
## 25 NA NA NA
## count_phd_pp_degrees count_phd_other_degrees count_phd_students
## 1 0 0 0
## 2 0 0 19
## 3 230 0 315
## 4 0 0 0
## 5 NA NA NA
## 6 NA NA NA
## 7 NA NA NA
## 8 0 0 0
## 9 NA NA NA
## 10 NA NA NA
## 11 NA NA NA
## 12 0 0 0
## 13 0 0 0
## 14 NA NA NA
## 15 4 0 4
## 16 NA NA NA
## 17 NA NA NA
## 18 NA NA NA
## 19 NA NA NA
## 20 0 0 0
## 21 NA NA NA
## 22 NA NA NA
## 23 82 0 96
## 24 0 0 0
## 25 NA NA NA
## count_certs_less_yr count_certs_less_yr_students count_certs_1_2_yr
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 91
## 6 0 0 43
## 7 0 0 34
## 8 0 0 0
## 9 262 262 143
## 10 NA NA NA
## 11 1 1 0
## 12 0 0 0
## 13 0 0 0
## 14 NA NA NA
## 15 0 0 0
## 16 50 50 8
## 17 29 29 25
## 18 0 0 3
## 19 0 0 255
## 20 0 0 0
## 21 NA NA NA
## 22 0 0 6
## 23 0 0 0
## 24 45 45 0
## 25 0 0 75
## count_certs_2_4_yr count_certs_1_4_yr_students count_certs_post_bach
## 1 0 0 0
## 2 0 0 20
## 3 0 0 9
## 4 0 0 0
## 5 0 91 NA
## 6 0 43 NA
## 7 0 34 NA
## 8 0 0 0
## 9 0 143 NA
## 10 NA NA NA
## 11 0 0 NA
## 12 0 0 0
## 13 0 0 0
## 14 NA NA NA
## 15 0 0 0
## 16 0 8 NA
## 17 0 25 NA
## 18 18 21 NA
## 19 0 255 NA
## 20 0 0 0
## 21 NA NA NA
## 22 0 6 NA
## 23 0 0 0
## 24 0 0 5
## 25 0 75 NA
## count_certs_post_mast count_certs_post_bach_mast_students
## 1 0 0
## 2 0 20
## 3 7 16
## 4 0 0
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 8 0 0
## 9 NA NA
## 10 NA NA
## 11 NA NA
## 12 0 0
## 13 0 0
## 14 NA NA
## 15 0 0
## 16 NA NA
## 17 NA NA
## 18 NA NA
## 19 NA NA
## 20 0 0
## 21 NA NA
## 22 NA NA
## 23 5 5
## 24 16 21
## 25 NA NA
# Show the last 10 rows
tail(x = degrees, n = 10)
## year state unitid
## 1154 2019 Massachusetts 431099
## 1155 2019 New Jersey 184056
## 1156 2019 District of Columbia 492102
## 1157 2019 Ohio 407568
## 1158 2019 Ohio 200785
## 1159 2019 Massachusetts 165802
## 1160 2019 Virginia 459082
## 1161 2019 Ohio 485908
## 1162 2019 Maryland 434937
## 1163 2019 New Jersey 451370
## inst_name hbcu
## 1154 Jolie Hair and Beauty Academy-Ludlow No
## 1155 Lincoln Technical Institute-Iselin No
## 1156 Daniel Morgan Graduate School of National Security No
## 1157 Raphael's School of Beauty Culture Inc-Boardman No
## 1158 Herzing University-Akron No
## 1159 Fisher College No
## 1160 Virginia Polytechnic Institute & State University No
## 1161 Antioch University Online No
## 1162 Yeshiva College of the Nations Capital No
## 1163 Yeshivas Be'er Yitzchok No
## inst_sector count_assoc_degrees
## 1154 Private for-profit, less-than 2-year 0
## 1155 Private for-profit, less-than 2-year 0
## 1156 Private not-for-profit, 4-year or above 0
## 1157 Private for-profit, 2-year 0
## 1158 Private not-for-profit, 4-year or above 107
## 1159 Private not-for-profit, 4-year or above 85
## 1160 Public, 4-year or above NA
## 1161 Private not-for-profit, 4-year or above 0
## 1162 Private not-for-profit, 4-year or above 0
## 1163 Private not-for-profit, 4-year or above 0
## count_assoc_students count_bach_degrees count_bach_students
## 1154 0 NA NA
## 1155 0 NA NA
## 1156 0 0 0
## 1157 0 NA NA
## 1158 107 24 24
## 1159 84 245 244
## 1160 NA NA NA
## 1161 0 32 32
## 1162 0 5 5
## 1163 0 20 20
## count_mast_degrees count_mast_students count_phd_rs_degrees
## 1154 NA NA NA
## 1155 NA NA NA
## 1156 17 17 0
## 1157 NA NA NA
## 1158 0 0 0
## 1159 36 36 0
## 1160 NA NA NA
## 1161 0 0 0
## 1162 0 0 0
## 1163 0 0 0
## count_phd_pp_degrees count_phd_other_degrees count_phd_students
## 1154 NA NA NA
## 1155 NA NA NA
## 1156 0 0 0
## 1157 NA NA NA
## 1158 0 0 0
## 1159 0 0 0
## 1160 NA NA NA
## 1161 0 0 0
## 1162 0 0 0
## 1163 0 0 0
## count_certs_less_yr count_certs_less_yr_students count_certs_1_2_yr
## 1154 96 96 28
## 1155 0 0 256
## 1156 0 0 0
## 1157 23 23 16
## 1158 0 0 19
## 1159 0 0 16
## 1160 NA NA NA
## 1161 0 0 0
## 1162 0 0 0
## 1163 0 0 0
## count_certs_2_4_yr count_certs_1_4_yr_students count_certs_post_bach
## 1154 0 28 NA
## 1155 0 256 NA
## 1156 0 0 0
## 1157 20 36 NA
## 1158 0 19 0
## 1159 0 16 0
## 1160 NA NA NA
## 1161 0 0 0
## 1162 0 0 0
## 1163 0 0 0
## count_certs_post_mast count_certs_post_bach_mast_students
## 1154 NA NA
## 1155 NA NA
## 1156 0 0
## 1157 NA NA
## 1158 0 0
## 1159 0 0
## 1160 NA NA
## 1161 0 0
## 1162 0 0
## 1163 0 0
# Per-column overview: quartiles, mean and NA count for numeric columns; length and class for character columns
summary(object = degrees)
## year state unitid inst_name
## Min. :2019 Length:1163 Min. :128498 Length:1163
## 1st Qu.:2019 Class :character 1st Qu.:183076 Class :character
## Median :2019 Mode :character Median :217420 Mode :character
## Mean :2019 Mean :281226
## 3rd Qu.:2019 3rd Qu.:434884
## Max. :2019 Max. :494843
##
## hbcu inst_sector count_assoc_degrees count_assoc_students
## Length:1163 Length:1163 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.0 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.0 Median : 0.00
## Mean : 119.3 Mean : 116.65
## 3rd Qu.: 36.0 3rd Qu.: 35.75
## Max. :5273.0 Max. :5163.00
## NA's :41 NA's :41
## count_bach_degrees count_bach_students count_mast_degrees count_mast_students
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 27.25 1st Qu.: 27.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 242.00 Median : 241.5 Median : 54.0 Median : 54.0
## Mean : 712.80 Mean : 704.2 Mean : 342.0 Mean : 338.5
## 3rd Qu.: 691.25 3rd Qu.: 686.5 3rd Qu.: 278.5 3rd Qu.: 278.5
## Max. :11107.00 Max. :10906.0 Max. :8182.0 Max. :8150.0
## NA's :633 NA's :633 NA's :633 NA's :633
## count_phd_rs_degrees count_phd_pp_degrees count_phd_other_degrees
## Min. : 0.00 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 0.00 Median : 0.00 Median : 0.0000
## Mean : 26.71 Mean : 41.51 Mean : 0.8472
## 3rd Qu.: 3.75 3rd Qu.: 7.00 3rd Qu.: 0.0000
## Max. :886.00 Max. :845.00 Max. :237.0000
## NA's :633 NA's :633 NA's :633
## count_phd_students count_certs_less_yr count_certs_less_yr_students
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 68.95 Mean : 45.98 Mean : 42.14
## 3rd Qu.: 28.75 3rd Qu.: 29.00 3rd Qu.: 29.00
## Max. :1687.00 Max. :5156.00 Max. :3560.00
## NA's :633 NA's :41 NA's :41
## count_certs_1_2_yr count_certs_2_4_yr count_certs_1_4_yr_students
## Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 1.00 Median : 0.000 Median : 6.00
## Mean : 44.69 Mean : 3.583 Mean : 47.39
## 3rd Qu.: 47.75 3rd Qu.: 0.000 3rd Qu.: 51.00
## Max. :1576.00 Max. :437.000 Max. :1575.00
## NA's :41 NA's :41 NA's :41
## count_certs_post_bach count_certs_post_mast
## Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.000
## Mean : 23.01 Mean : 6.889
## 3rd Qu.: 7.00 3rd Qu.: 0.000
## Max. :816.00 Max. :333.000
## NA's :633 NA's :633
## count_certs_post_bach_mast_students
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 29.05
## 3rd Qu.: 14.75
## Max. :815.00
## NA's :633
# Compact structure: dimensions, plus each column's type and first few values
str(object = degrees)
## 'data.frame': 1163 obs. of 24 variables:
## $ year : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ state : chr "Maryland" "Virginia" "Virginia" "Maryland" ...
## $ unitid : int 491631 234155 231624 163912 200590 479062 444291 450298 232919 457891 ...
## $ inst_name : chr "Women's Institute of Torah Seminary" "Virginia State University" "William & Mary" "St. Mary's College of Maryland" ...
## $ hbcu : chr "No" "Yes" "No" "No" ...
## $ inst_sector : chr "Private not-for-profit, 4-year or above" "Public, 4-year or above" "Public, 4-year or above" "Public, 4-year or above" ...
## $ count_assoc_degrees : int 0 0 0 0 11 0 0 13 0 NA ...
## $ count_assoc_students : int 0 0 0 0 11 0 0 13 0 NA ...
## $ count_bach_degrees : int 45 672 1653 392 NA NA NA 31 NA NA ...
## $ count_bach_students : int 45 672 1653 392 NA NA NA 31 NA NA ...
## $ count_mast_degrees : int 0 113 737 29 NA NA NA 12 NA NA ...
## $ count_mast_students : int 0 113 734 29 NA NA NA 12 NA NA ...
## $ count_phd_rs_degrees : int 0 19 85 0 NA NA NA 0 NA NA ...
## $ count_phd_pp_degrees : int 0 0 230 0 NA NA NA 0 NA NA ...
## $ count_phd_other_degrees : int 0 0 0 0 NA NA NA 0 NA NA ...
## $ count_phd_students : int 0 19 315 0 NA NA NA 0 NA NA ...
## $ count_certs_less_yr : int 0 0 0 0 0 0 0 0 262 NA ...
## $ count_certs_less_yr_students : int 0 0 0 0 0 0 0 0 262 NA ...
## $ count_certs_1_2_yr : int 0 0 0 0 91 43 34 0 143 NA ...
## $ count_certs_2_4_yr : int 0 0 0 0 0 0 0 0 0 NA ...
## $ count_certs_1_4_yr_students : int 0 0 0 0 91 43 34 0 143 NA ...
## $ count_certs_post_bach : int 0 20 9 0 NA NA NA 0 NA NA ...
## $ count_certs_post_mast : int 0 0 7 0 NA NA NA 0 NA NA ...
## $ count_certs_post_bach_mast_students: int 0 20 16 0 NA NA NA 0 NA NA ...
# Sort by state, then by institution name within each state
degrees <- arrange(degrees, state, inst_name)
# desc() sorts in descending order: by state, then by count of bachelor's degrees, highest to lowest
degrees_bach <- arrange(degrees, state, desc(count_bach_degrees))
# Removing the example data frame from the environment
rm(degrees_bach)
# filter() keeps the rows that meet a condition.
# A data frame of the institutions in New Jersey:
new_jersey <- filter(degrees, state == "New Jersey")
# What about multiple conditions? Combine them with & (every condition must hold).
# HBCU institutions in Virginia:
virginia_hbcu <- filter(degrees, state == "Virginia" & hbcu == "Yes")
# ...and the Virginia institutions that are not HBCUs:
virginia_non_hbcu <- filter(degrees, state == "Virginia" & hbcu != "Yes")
# What if you want Massachusetts institutions, but only the student counts and the institution's main info?
# Filter the rows first, then select the desired variables.
mass_student_count <- degrees %>%
  filter(state == "Massachusetts") %>%
  select(
    unitid, inst_name, hbcu, inst_sector,
    count_assoc_students, count_bach_students, count_mast_students,
    count_phd_students, count_certs_less_yr_students,
    count_certs_1_4_yr_students, count_certs_post_bach_mast_students
  )
# What about numbers? Numeric filter conditions are written without quotation marks.
assoc_deg <- degrees %>%
  # rows where 1,000 or more associate's degrees were awarded
  filter(count_assoc_degrees >= 1000) %>%
  # keep these variables
  select(
    unitid, inst_name, state, hbcu, inst_sector,
    count_assoc_degrees, count_assoc_students
  ) %>%
  # and, finally, sort in descending order by count_assoc_degrees
  arrange(desc(count_assoc_degrees))
# Clean up the example data frames
rm(new_jersey, virginia_hbcu, virginia_non_hbcu, mass_student_count, assoc_deg)
# A new data frame for PhDs, built with the OR operator (|).
# It keeps the rows of "degrees" where any of the PhD degree variables is greater than zero.
phd_degrees <- degrees %>%
  filter(
    count_phd_rs_degrees > 0 |
      count_phd_pp_degrees > 0 |
      count_phd_other_degrees > 0
  ) %>%
  select(
    unitid, inst_name, state, hbcu, inst_sector,
    count_phd_rs_degrees, count_phd_pp_degrees, count_phd_other_degrees
  )
# Structure of a new variable
## df_name$new_variable_name <- df_name$variable_1 + df_name$variable_2
# with(df_name, ...) lets the column names be written without repeating "df_name$"
# Let's create a new variable--Total PhDs
phd_degrees$degrees_total <- with(
  phd_degrees,
  count_phd_rs_degrees + count_phd_pp_degrees + count_phd_other_degrees
)
# Percentage of PhDs that were research and scholarly
# Note: / is used for dividing (* for multiplication, - for subtraction)
phd_degrees$pct_phd_rs <- with(phd_degrees, count_phd_rs_degrees / degrees_total)
# What do you notice about this new variable? It has many decimal places.
# Can we round it? Of course!
# How to round an already established variable:
phd_degrees$pct_phd_rs <- round(phd_degrees$pct_phd_rs, digits = 3)
# How to calculate the percentage and round it in a single step:
phd_degrees$pct_phd_rs <- with(
  phd_degrees,
  round(count_phd_rs_degrees / degrees_total, digits = 3)
)
# What if you want it formatted like a percent for a report?
# This is a user-defined function that does just that.
#
# percent(): format proportions as percentage strings, e.g. 0.5 -> "50.0%".
#   x      numeric vector of proportions (1 means 100%)
#   digits number of decimal places to show (default 1)
#   format formatC() format code (default "f", fixed notation)
#   ...    further arguments passed on to formatC()
# Returns a character vector the same length as x. Missing values stay NA
# (rather than becoming the text "NA%") and empty input gives character(0)
# (rather than a lone "%").
percent <- function(x, digits = 1, format = "f", ...) {
  # paste0() would turn zero-length input into a single "%", so return early
  if (length(x) == 0L) {
    return(character(0))
  }
  out <- paste0(formatC(x * 100, format = format, digits = digits, ...), "%")
  # Keep missing values missing so they can still be detected with is.na()
  out[is.na(x)] <- NA_character_
  out
}
# Note the "_char" at the end of this new variable. Once a variable has been formatted as text, R no longer treats it as a number.
# Lesson from the live presentation: the total column was originally going to be called "count_phd_degrees_total" and was later
# shortened to "degrees_total", but one call was never updated and still used the old name:
#   phd_degrees$pct_phd_rs_char <- percent(round((phd_degrees$count_phd_rs_degrees / phd_degrees$count_phd_degrees_total), digits=3))
# That call caused an error during the presentation because no column has that name. It is kept above as a comment only,
# so the script no longer runs it. When you rename a variable, update every place that uses it.
# The correct call uses "degrees_total":
phd_degrees$pct_phd_rs_char <- percent(round(phd_degrees$count_phd_rs_degrees / phd_degrees$degrees_total, digits = 3))
# You won't be able to conduct any math operations on character variables
str(phd_degrees)
## 'data.frame': 206 obs. of 11 variables:
## $ unitid : int 128771 129242 129491 130226 130253 130493 128744 129020 436827 463056 ...
## $ inst_name : chr "Central Connecticut State University" "Fairfield University" "Hartford Seminary" "Quinnipiac University" ...
## $ state : chr "Connecticut" "Connecticut" "Connecticut" "Connecticut" ...
## $ hbcu : chr "No" "No" "No" "No" ...
## $ inst_sector : chr "Public, 4-year or above" "Private not-for-profit, 4-year or above" "Private not-for-profit, 4-year or above" "Private not-for-profit, 4-year or above" ...
## $ count_phd_rs_degrees : int 5 0 0 0 0 14 19 345 4 7 ...
## $ count_phd_pp_degrees : int 5 31 0 311 71 0 61 366 4 8 ...
## $ count_phd_other_degrees: int 0 0 5 0 0 0 0 0 0 0 ...
## $ degrees_total : int 10 31 5 311 71 14 80 711 8 15 ...
## $ pct_phd_rs : num 0.5 0 0 0 0 1 0.238 0.485 0.5 0.467 ...
## $ pct_phd_rs_char : chr "50.0%" "0.0%" "0.0%" "0.0%" ...
## Someone asked about the possibility of rounding errors
## To test this, first create the 3 percent variables rounded to 2 digits
phd_degrees$pct_phd_rs_round <- with(phd_degrees, round(count_phd_rs_degrees / degrees_total, digits = 2))
phd_degrees$pct_phd_pp_round <- with(phd_degrees, round(count_phd_pp_degrees / degrees_total, digits = 2))
phd_degrees$pct_phd_other_round <- with(phd_degrees, round(count_phd_other_degrees / degrees_total, digits = 2))
# Then create the same 3 percentage variables without any rounding
phd_degrees$pct_phd_rs <- with(phd_degrees, count_phd_rs_degrees / degrees_total)
phd_degrees$pct_phd_pp <- with(phd_degrees, count_phd_pp_degrees / degrees_total)
phd_degrees$pct_phd_other <- with(phd_degrees, count_phd_other_degrees / degrees_total)
# Finally, add the rounded variables together (test1) and the non-rounded variables together (test2)
phd_degrees$test1 <- with(phd_degrees, pct_phd_rs_round + pct_phd_pp_round + pct_phd_other_round)
phd_degrees$test2 <- with(phd_degrees, pct_phd_rs + pct_phd_pp + pct_phd_other)
# Inspect both columns (e.g. with View(phd_degrees)) before the data frame is removed:
# in this data they all add up to 1.00 (for test1) and 1 (for test2)
rm(phd_degrees)
# Using dplyr's summarise function: after group_by(), it returns one row per group
# This is a record count of the institutions by state
count_by_state <- degrees %>%
  group_by(state) %>%
  summarise(inst_count = n())
# You can add onto this and compute several summaries in the same call
count_by_state_assoc_deg <- degrees %>%
  group_by(state) %>%
  summarise(
    inst_count = n(),
    sum_assoc_degrees = sum(count_assoc_degrees)
  )
# Look at this data frame--what's wrong with it?
# Why are DC, DE and WV the only states with totals?
test <- degrees %>%
  filter(state %in% c("West Virginia", "District of Columbia", "Delaware"))
# sum() returns NA as soon as any value it is given is NA.
# WV, DC & DE had 0s rather than NAs listed in the data frame, which is why sum() worked for them.
# Two ways to fix this
# 1- in the specific summary, tell sum() to skip the NAs:
count_by_state_assoc_deg <- degrees %>%
  group_by(state) %>%
  summarise(
    inst_count = n(),
    sum_assoc_degrees = sum(count_assoc_degrees, na.rm = TRUE)
  )
# 2- getting rid of the NAs in the data frame itself
# replacing NAs in just one column
degrees$count_assoc_degrees <- replace(
  degrees$count_assoc_degrees,
  is.na(degrees$count_assoc_degrees),
  0
)
# replacing NAs--whole data frame
degrees[is.na(degrees)] <- 0
# Now that 0s are in place of all NAs in the "degrees" data frame, na.rm is no longer needed
count_by_state_assoc_bach_deg <- degrees %>%
  group_by(state) %>%
  summarise(
    inst_count = n(),
    sum_assoc_degrees = sum(count_assoc_degrees),
    sum_bach_degrees = sum(count_bach_degrees)
  )
# What if you want sums of all the numeric variables?
# This first attempt is expected to fail: it asks for sum() of every non-grouping column,
# including character columns such as inst_name (see the error in the output below).
# summarise_each() and funs() are also deprecated, as the warnings in the output report.
total_by_state <- degrees %>% group_by(state) %>% summarise_each(funs(sum)) #spoiler--this will cause an error
## Warning: `summarise_each_()` is deprecated as of dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Error: Problem with `summarise()` input `inst_name`.
## x invalid 'type' (character) of argument
## i Input `inst_name` is `sum(inst_name)`.
## i The error occured in group 1: state = "Connecticut".
# Summarise only the numeric variables:
# across(where(is.numeric), sum) applies sum() to every column that is numeric.
# (It is the current replacement for the superseded summarise_if(is.numeric, sum).)
total_by_state <- degrees %>%
  group_by(state) %>%
  summarise(across(where(is.numeric), sum))
# This summed up Unit ID and Year, which isn't necessary or helpful
# 'unitid' and 'year' are identifiers, not quantities, so change them from numerics to characters.
# Convert directly with as.character(): going through as.numeric() first can write
# round values in scientific notation (as.character(100000) gives "1e+05").
degrees$unitid <- as.character(degrees$unitid)
degrees$year <- as.character(degrees$year)
# Let's try this again--unitid and year are no longer numeric, so they are left out
total_by_state_v2 <- degrees %>%
  group_by(state) %>%
  summarise(across(where(is.numeric), sum))
rm(count_by_state, count_by_state_assoc_deg, test, total_by_state, total_by_state_v2)
# Extracting the row with the minimum of a field, per state
# which.min() gives the position of the first minimum; slice() keeps the row at that position
lowest_masters_count <- degrees %>%
  group_by(state) %>%
  slice(which.min(count_mast_degrees))
# All that code did was return the first institution listed with a value of 0. Not very helpful.
# Options
# 1- Create a new data frame where "count_mast_degrees" is greater than 0 and use that data frame
masters_deg <- filter(degrees, count_mast_degrees > 0)
lowest_masters_count_option_1 <- masters_deg %>%
  group_by(state) %>%
  slice(which.min(count_mast_degrees))
# 2- Do it in one pipeline. The order of the commands matters: the filter needs to come before the slicing
lowest_masters_count_option_2 <- degrees %>%
  group_by(state) %>%
  filter(count_mast_degrees > 0) %>%
  slice(which.min(count_mast_degrees))
# slice(which.max(...)), as you can imagine, gives you the row with the maximum value
rm(lowest_masters_count, masters_deg, lowest_masters_count_option_1, lowest_masters_count_option_2)
# Merge the state name in 'degrees' with its state abbreviation
# This example reads data from an online source straight into the R environment, without downloading it first
# Data extracted from: https://worldpopulationreview.com/states/state-abbreviations
states <- read.csv(file = url("https://worldpopulationreview.com/static/states/name-abbr.csv"))
# What's wrong with this data frame?
head(states, n = 3)
## Alabama AL
## 1 Alaska AK
## 2 Arizona AZ
## 3 Arkansas AR
# The first state was used as the column names. header = FALSE tells read.csv() the first row is data:
states_v2 <- read.csv(file = url("https://worldpopulationreview.com/static/states/name-abbr.csv"), header = FALSE)
# What's wrong with this data frame?
head(states_v2, n = 2)
## V1 V2
## 1 Alabama AL
## 2 Alaska AK
# The good news is we now have ALL states in the data frame, but the bad news is the column names aren't meaningful
colnames(states_v2) <- c("state_name", "state_abb")
head(states_v2, n = 2)
## state_name state_abb
## 1 Alabama AL
## 2 Alaska AK
# Yay! The data are in the format we need!
# Merging the two data sets together: match degrees$state to states_v2$state_name,
# and keep every row of 'degrees' (all.x = TRUE) even when no abbreviation is found
degrees <- merge(degrees, states_v2, by.x = "state", by.y = "state_name", all.x = TRUE)
# Note: the order of your variables has changed. You may want/need to rearrange them
rm(states, states_v2)
# More on Loading Data into R: Cookbook for R is a really good source of R help
# 1- list each unique value of inst_sector
sectors <- degrees %>% distinct(inst_sector)
# Typing an object's name prints it
sectors
## inst_sector
## 1 Private for-profit, less-than 2-year
## 2 Public, less-than 2-year
## 3 Public, 2-year
## 4 Private not-for-profit, 4-year or above
## 5 Private for-profit, 2-year
## 6 Private for-profit, 4-year or above
## 7 Public, 4-year or above
## 8 Private not-for-profit, 2-year
## 9 Private not-for-profit, less-than 2-year
# Let's say you wanted to divide the "sector" variable into three sections
# 1--public or private
# 2--for-profit or not-for-profit
# 3--years
# We'll be doing R's version of Excel's Text to Columns, but first each value of inst_sector
# has to be rewritten so its three parts are separated by one delimiter ("/") and contain no spaces
# R's ifelse
# Let's start with the semantics of an ifelse statement
# data_frame$new_variable <- ifelse("this condition is met", "option1 if condition met", "option2 if condition is not met")
degrees$example <- ifelse(degrees$count_bach_degrees > 0, "Bachelor Degrees Awarded", "No Bachelor Degrees Awarded")
# Here is an example with many different options.
# ifelse() calls can be nested one inside the other, but nine options deep that is hard to read, and a
# final catch-all "else" would silently mislabel any value that was not expected.
# dplyr's case_when() lists each condition ~ result pair on its own line; the first condition that is TRUE wins.
# Every known sector is spelled out, and anything else becomes NA so that it gets noticed.
degrees$inst_sector_rev <- case_when(
  degrees$inst_sector == "Private for-profit, 2-year" ~ "Private/for_profit/2_yr",
  degrees$inst_sector == "Private for-profit, 4-year or above" ~ "Private/for_profit/4_yr",
  degrees$inst_sector == "Private for-profit, less-than 2-year" ~ "Private/for_profit/less_than_2_yr",
  degrees$inst_sector == "Private not-for-profit, 2-year" ~ "Private/not_for_profit/2_yr",
  degrees$inst_sector == "Private not-for-profit, 4-year or above" ~ "Private/not_for_profit/4_yr",
  degrees$inst_sector == "Private not-for-profit, less-than 2-year" ~ "Private/not_for_profit/less_than_2_yr",
  degrees$inst_sector == "Public, 2-year" ~ "Public/not_for_profit/2_yr",
  degrees$inst_sector == "Public, 4-year or above" ~ "Public/not_for_profit/4_yr",
  degrees$inst_sector == "Public, less-than 2-year" ~ "Public/not_for_profit/less_than_2_yr",
  TRUE ~ NA_character_
)
# Duplicate the recoded field so that the combined value (inst_sector_rev) is still there to compare against at the end
degrees$inst_sector_rev_2 <- degrees$inst_sector_rev
# This is R's version of Excel's Text To Columns feature: split the copy at each "/" into three new columns
# NOTE(review): `into` reuses the name "inst_sector", which already exists in degrees; separate() appears to
# replace the existing column, so the original wording survives only in recoded form in inst_sector_rev--confirm this is intended.
degrees <- degrees %>%
  separate(
    col = inst_sector_rev_2,
    into = c("inst_sector", "inst_type", "inst_level"),
    sep = "\\/"
  )
# To write a file back out, run this code; with a relative file name it is placed in your working directory
write.csv(x = degrees, file = "degrees_rev.csv", row.names = FALSE)